Library loading


In [1]:
import os

# for handling dataframe
import re
import csv
import pandas as pd
import numpy as np

# for BOW
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# for wordcloud
import matplotlib.pylab as plt
from wordcloud import WordCloud
from PIL import Image

# working directory setting
work_dir = 'D:/Document/project/HYStudy/scripts'
os.chdir(work_dir)

Data loading


In [2]:
raw_text = pd.read_csv('[HYStudy 17th] ex_data.csv', encoding='utf-8', names=['content'])
raw_text.head()


Out[2]:
content
0 대리점 폰 사려 꼭알아가야할점 그런것들있나 제 지금 gpro2 쓰 넘 약정 끝나 그...
1 위약4 질문 g5 제 ㅎㅇ 개통 599유 조건 그 유지 끝내 나 폰 바꾸 되 저 청...
2 가격대 중고가20만원 포함 추천 이어폰 어느정도 후보군 추려 er4 ue900 트파...
3 잠깐 기기 위약금 나오 제 g5 새기 생기 통신사 skt 유심 하려 보 g5새 lg...
4 g5 진열 되 상태 폰 못쓸거같은데 예 도색 벗기 요

Make Corpus


In [3]:
corpus = np.array(raw_text['content'])
print(len(corpus))
print(corpus[0:3])

raw_text.tail()


8349
[ '대리점 폰 사려 꼭알아가야할점 그런것들있나 제 지금 gpro2 쓰 넘 약정 끝나 그 폰 발열 하구 베터리 따르 엇보 와이파이 접촉 불량때문 와이파이 켜 않 이참 폰 바꾸 하 노트5 g5 생각 g5 평이 너무 안좋더 노트5생 요즘 노트5 대리점 구입 얼마인가 kt쓰고있 기기 하 생각 폰 법 바뀌 나 구매 알아야할점 그런것들있나 네이버 치 노트5 정도 하 것 맞 요즘 69했을경우 음 제 대충 보기 요금 따르 공시 바뀌 요금 비싼요금제 시작 한달 쓰 바 바꾸 되 이런것들 추가적 있 대리점 요금 구매 혜택 있 블루스 그런곳 바꾸 정보 주세 사 당하 하 흑'
 '위약4 질문 g5 제 ㅎㅇ 개통 599유 조건 그 유지 끝내 나 폰 바꾸 되 저 청구 위약4 3 ㅎㅇ완납 4 금액 550 사용 제외 730 이 되 ㅎㅇ 개통 g5 사용 다시 번이했 경우 위 금액만큼 제 물 맞 문의 드립 다'
 '가격대 중고가20만원 포함 추천 이어폰 어느정도 후보군 추려 er4 ue900 트파 포낙 보스 h3 등 있 무선 제이버드 akg 브라 rox sbh80 정도 있나봐 추천 대브븐 유선 해주 무선 음악 리면 되는거 apt x 지원 sbh80 유닛 소리 어느정도 간음 안되 비교 분 계신 그리 g5 이번 나 리시버 o 연결 위 후보군 성능 다 올라가능건가 니 b o 성향 맞추 유닛 가리 쓰 분 계신 요']
Out[3]:
content
8344 카우붐 마지막 수령 레노 g50 amd 램6기 모델 비닐 다 안떼졌 베젤 극 기스 ...
8345 라온티앤아 타무즈 스톤 x 사용 싼 게이밍 마우스 다를봐 없 게이밍 라온티앤아 타무...
8346 만약 ㅎㅇ 핸드폰 구입 핸드폰 새 사 되 크 ㅂㅇ ㅎㅇ v10 을 구입 초 g5 나...
8347 cube t8 plus noroot 내장메모리 통합 순정 리커버리 sd 해제 cub...
8348 노트북 살 하 사야 하 모르 업무용 가지 녀 집 사용 게임 던파 가끔 하 현재 사용...

Make BOW


In [4]:
# except 1-letter word
## min_df: integer(frequency), float(ratio)
tf_vectorizer = CountVectorizer(min_df = 0.001, token_pattern=r'\w{2,}')

tf_corpus = tf_vectorizer.fit(corpus)
tf_bow = tf_vectorizer.fit_transform(corpus)
tf_bow


Out[4]:
<8349x3578 sparse matrix of type '<class 'numpy.int64'>'
	with 193856 stored elements in Compressed Sparse Row format>

In [5]:
tfidf_vectorizer = TfidfVectorizer(min_df = 0.001, token_pattern=r'\w{2,}')

tfidf_corpus = tf_vectorizer.fit(corpus)
tfidf_bow = tfidf_vectorizer.fit_transform(corpus)
tfidf_bow


Out[5]:
<8349x3578 sparse matrix of type '<class 'numpy.float64'>'
	with 193856 stored elements in Compressed Sparse Row format>
  • TF vectorizer

In [6]:
# check vocabulary in TDM
print(len(tf_corpus.vocabulary_))
print(len(tf_corpus.get_feature_names()))
tf_vectorizer.get_feature_names()[1000:1010]


3578
3578
Out[6]:
['대폭', '대하', '대학생', '대한민국', '대해', '대형', '대화면', '댓글', '더럽', '더불']

In [7]:
# frequency count
tf_word_sum = tf_bow.toarray().sum(axis=0)
tf_word_name = tf_corpus.get_feature_names()
tf_word_dict = {}

for i in range(len(tf_word_sum)):
    tf_word_dict[tf_word_name[i]] = tf_word_sum[i]
    
print(tf_corpus.get_feature_names()[150:160])
print(tf_word_sum[150:160])


['g5쪽', 'g5출시', 'g5카메', 'g5하', 'g5후', 'g6', 'g7', 'g7x', 'galaxy', 'gk']
[ 9 16 11  9  9 66 11 15 54 18]

In [8]:
# word & index number
tf_word_dict


Out[8]:
{'기록': 27,
 '계약서': 18,
 '마시': 20,
 '그래픽카드': 45,
 '고민': 874,
 '괜찮다': 29,
 '나을': 82,
 '옮기': 106,
 '안타깝': 39,
 '선택지': 22,
 '광주': 17,
 '철회': 21,
 '문의': 180,
 '노트북': 146,
 '화이트': 82,
 '유플러스': 123,
 '일하': 19,
 '기본': 429,
 '구매자': 55,
 '하이마트': 171,
 '홍보': 38,
 'ㅌㅋㄴ': 25,
 '64gb': 27,
 'ls2d': 22,
 '경기': 13,
 '위치': 108,
 '고정이': 12,
 '먼지': 59,
 '메뉴': 42,
 '국내': 199,
 '편한': 37,
 '평소': 77,
 '싼맛': 10,
 '손가락': 50,
 '날씨': 52,
 '입체감': 10,
 '화웨이': 37,
 '붙이': 98,
 '효도': 10,
 '시계': 39,
 '방진': 14,
 '분리형': 16,
 '만나': 42,
 'a7': 73,
 '좋을거': 18,
 '클리앙': 11,
 '인터파크': 18,
 '찍기': 19,
 '답답': 84,
 '매달': 38,
 '무겁': 41,
 '퀵커버': 20,
 '전국': 10,
 '의향': 9,
 'mode': 152,
 '정리': 84,
 '국민': 18,
 '일주일': 95,
 '특별': 44,
 '묻히': 10,
 '웹서핑': 73,
 '튼튼': 27,
 '점이': 69,
 'dslr': 41,
 '가죽': 30,
 '대부분': 138,
 '높다': 14,
 '작업': 44,
 '예약': 143,
 '진심': 16,
 '사과': 27,
 '칩셋': 13,
 'cat6': 21,
 '택배': 151,
 '어둡': 119,
 '노트5': 245,
 '이번': 863,
 '웨이즈': 69,
 '저번': 34,
 '특성': 15,
 '지난번': 12,
 '파손': 32,
 '모듈빼': 9,
 '채우': 32,
 'kt': 460,
 '단말': 31,
 '사업부': 14,
 '안되는거': 24,
 '게다': 53,
 '구해': 31,
 '조립': 27,
 '장기적': 10,
 '거리': 65,
 '탐나': 27,
 'ㅂㅇ': 79,
 '내야': 27,
 '달라': 109,
 '가입': 150,
 '판매': 195,
 '해상도': 110,
 '되지': 105,
 '통하': 114,
 '가기': 45,
 '이럴': 11,
 '판정': 21,
 '반응속도': 14,
 '저가형': 19,
 '정상적': 38,
 '유투브': 42,
 '적응': 93,
 '절약': 13,
 'ㄷㄷ': 123,
 '학생': 14,
 '인봉': 32,
 '광탈': 46,
 '활성화': 22,
 '은거': 11,
 '스마트워치': 16,
 '이부': 31,
 '안주': 13,
 '스냅': 87,
 '길이': 42,
 '조사': 10,
 '마구': 10,
 '인해': 66,
 'techholic': 9,
 '다시': 637,
 '싸구려': 19,
 'ㅅㄷㄹ가': 11,
 '감기': 12,
 '대기': 58,
 '하도': 37,
 '하필': 12,
 '가격대': 80,
 '단통': 11,
 '빠르': 235,
 '없애': 48,
 '신경': 133,
 '귀찮': 118,
 '절대': 69,
 '자극': 11,
 '이르': 56,
 '편차': 12,
 '사실상': 44,
 '작동': 87,
 '바뀌': 116,
 '교품증': 185,
 '한마디': 18,
 '후속': 31,
 '취소': 59,
 '아그': 9,
 '제이슨': 10,
 '다운': 81,
 '생김': 10,
 '수채화': 29,
 '중간중간': 11,
 '인상': 10,
 '받으': 10,
 '갤수육': 15,
 '내년': 22,
 '운영': 19,
 'audio': 14,
 '정착': 13,
 '모듈': 2625,
 '처리': 68,
 '개취': 35,
 '확대': 41,
 '점유율': 27,
 '어머니': 125,
 '현존': 16,
 '물리': 51,
 '서류': 23,
 '방출': 13,
 '못생기': 19,
 '세로': 24,
 '현명': 13,
 '겨울': 11,
 '각종': 25,
 '땡기': 42,
 '공간': 28,
 '총평': 14,
 '인간': 10,
 '해주시': 27,
 '생길': 40,
 '사무실': 26,
 '규모': 13,
 '오더': 9,
 '설명': 112,
 '드릴': 29,
 'sid2': 23,
 '알아보다': 21,
 '마감': 153,
 '발급': 34,
 '보다': 61,
 '사은품': 171,
 '장소': 13,
 '다음날': 24,
 '빠르다': 11,
 '심지': 44,
 '실수': 38,
 '그럼': 23,
 '발표': 86,
 '번이': 322,
 '옵션': 42,
 '울산': 11,
 '남지': 10,
 '하단부': 48,
 '광각': 530,
 '하나하나': 10,
 '아저씨': 13,
 '하이': 19,
 '각도': 15,
 '보스': 11,
 '매년': 10,
 '갤6': 107,
 '스타': 20,
 '어떠': 57,
 '안정감': 12,
 '망작': 9,
 '빠릿': 22,
 '펌웨어': 27,
 '일반인': 34,
 '꺼지': 59,
 '불가능': 62,
 '쓰지': 45,
 '아침': 66,
 '주지': 16,
 '탑재': 115,
 '물품': 12,
 '오늘': 994,
 '전화로': 11,
 '부2': 41,
 '하지않': 9,
 '먼저': 112,
 '프렌즈': 115,
 '내면': 27,
 '근래': 13,
 '주면': 18,
 '넘사벽': 17,
 'ㄹㄱㅂㅇ': 97,
 '인정': 37,
 '과장': 13,
 '마마무': 21,
 '산지': 35,
 '동생': 57,
 '감감무소식': 9,
 '양품': 213,
 '변경': 164,
 '하시': 212,
 '하이브리드': 13,
 'a4용지': 15,
 '국민카드': 31,
 'edge': 40,
 '케이스': 677,
 '백그라운드': 13,
 '이것이': 11,
 '사놓': 12,
 '등에': 22,
 '현실': 39,
 '프로세서': 20,
 'g540': 17,
 '각각': 34,
 'qc': 69,
 '중요시': 12,
 'galaxy': 54,
 '그정': 33,
 '6in': 12,
 'htc': 32,
 '고정': 53,
 '유리': 57,
 '채택': 47,
 '목적': 36,
 '롤링': 28,
 '삼성': 941,
 'ㅅㅋ': 69,
 '스크린': 42,
 '욕심': 29,
 '그림': 20,
 '덕분': 49,
 '중요': 126,
 '초중반': 10,
 '밤에': 35,
 '지속적': 12,
 '아내': 16,
 '9ghz': 9,
 '신경안쓰': 12,
 '메모리': 94,
 '모델명': 12,
 '노이즈': 55,
 '보다보': 34,
 '인수': 11,
 '고르': 62,
 '나오길': 18,
 '남자': 45,
 '물에': 13,
 '아식스': 19,
 '글쓰': 22,
 '고집': 17,
 '반해': 16,
 '이젠': 65,
 '방식': 146,
 '사정': 10,
 '은근': 42,
 '단통법': 126,
 '보여주': 73,
 '외관': 116,
 '오고': 24,
 '저장': 53,
 '위하': 185,
 '연락': 100,
 '지우': 22,
 '때까지': 9,
 '아가': 17,
 '대화면': 10,
 '신경쓰': 79,
 '커지': 24,
 '서비스': 126,
 '용어': 20,
 '얼른': 33,
 '감사': 139,
 '차후': 20,
 '기업': 60,
 '2년정': 10,
 '오해': 15,
 '상상': 19,
 '부무': 36,
 '완납': 56,
 '서랍': 13,
 '업무': 15,
 '해외': 105,
 '올라가': 28,
 '현완': 69,
 '귀가': 19,
 '치명적': 36,
 '이거': 426,
 '128gb': 10,
 '하반기': 15,
 '상태': 268,
 '안정성': 9,
 '커뮤니티': 21,
 '엄청': 370,
 '뒤지': 17,
 '현실적': 11,
 '정품': 62,
 '갤s7': 91,
 'url': 12,
 '선보': 21,
 'sd': 96,
 '보류': 10,
 '일일이': 11,
 '114': 13,
 '인터페이스': 15,
 '마케팅': 89,
 '이어폰': 576,
 '분위기': 60,
 '정확': 110,
 '맨날': 22,
 '저항': 16,
 '유지': 182,
 '원활': 11,
 '팍팍': 11,
 '다가오': 20,
 '서비스센터': 181,
 'be': 36,
 '떨어지': 123,
 '착한': 10,
 '하루': 144,
 '해결방법': 10,
 '트렌드': 14,
 '기왕': 10,
 '극복': 12,
 '네트워크': 10,
 'vga': 12,
 '리모콘': 14,
 '현대': 18,
 '단점': 233,
 '폰값': 11,
 '오류': 53,
 '난리': 28,
 '등록': 33,
 '계기': 12,
 '개인적': 365,
 'naver': 140,
 'for': 82,
 '5se': 12,
 'laptop': 20,
 '중앙': 15,
 '자체': 240,
 '연결': 335,
 '여론': 9,
 '스토어': 11,
 '중국': 78,
 '앰프': 14,
 '유선': 34,
 '눈팅': 77,
 'electronics': 56,
 '완전체': 10,
 '어짜피': 30,
 '구경': 55,
 '방탄': 12,
 '베샵': 31,
 '인거': 21,
 '귀엽': 11,
 '카드결제': 20,
 '하나': 358,
 '적지': 10,
 '기사': 56,
 '포함': 107,
 '건너': 10,
 '정보공유': 10,
 '구글': 89,
 '어려': 25,
 '해당': 85,
 'asrock': 10,
 '잠금': 42,
 '나타내': 26,
 '데이트': 128,
 '열흘': 15,
 '지프': 40,
 '대세': 21,
 '나르': 211,
 '허허': 22,
 '노트3': 91,
 '빠릿하': 12,
 '겔럭시': 16,
 '플레이': 65,
 'sw': 9,
 '발생': 175,
 '지네': 11,
 '미디어': 18,
 '지점': 17,
 '마트': 10,
 '꺼리': 10,
 '상단': 103,
 '돌아가': 50,
 '좋다': 128,
 '게임': 469,
 '생기': 226,
 '분리': 152,
 '카드': 216,
 '빠지': 117,
 '쌓이': 9,
 '예상': 145,
 '성향': 18,
 '땡겨': 9,
 '개철': 58,
 '제시': 13,
 '샀다': 21,
 '능력': 14,
 'aspx': 14,
 '한데': 143,
 '희망': 20,
 '그때': 38,
 '강력': 17,
 '듣기': 21,
 '반대쪽': 15,
 '강제': 25,
 '아마': 30,
 '허접': 18,
 'lcd': 79,
 '있을까': 175,
 '일정': 44,
 '유일': 30,
 '간지': 11,
 'gpro2': 16,
 '롤리팝': 26,
 '주변기기': 29,
 '볼땐': 10,
 '수요': 13,
 '로지텍': 26,
 '구합': 10,
 '가족': 79,
 '와중': 17,
 '어떨까': 20,
 '원가': 14,
 '흥미': 20,
 '디자인과': 25,
 '순위': 16,
 '결론적': 23,
 '이상은': 24,
 '주변': 85,
 'of': 89,
 '검색': 193,
 '599요금제': 34,
 '한시': 19,
 '간단': 148,
 '쥐5': 26,
 '확정': 21,
 '어이': 10,
 '더하': 44,
 '직영점': 49,
 '부정적': 13,
 '공짜': 60,
 '걸치': 9,
 '고치': 17,
 '기회': 38,
 '측정': 61,
 '버벅': 60,
 '장착': 167,
 'samsung': 17,
 '스냅드래곤': 37,
 '아니': 1532,
 '버젼': 12,
 '성공': 100,
 '설레': 15,
 '한쪽': 62,
 '남기': 74,
 '잡음': 25,
 '한번': 124,
 '이제': 468,
 '정책': 146,
 'lte': 122,
 '유독': 19,
 '저하': 16,
 '버튼': 233,
 '단자': 72,
 '자국': 19,
 '현상': 172,
 'lte2': 13,
 '재미': 59,
 '패턴': 37,
 '팬택': 31,
 '걸리': 134,
 '어서': 21,
 '젠더': 129,
 '내일': 257,
 'g4': 775,
 '내부': 70,
 '방금': 79,
 '있는곳': 10,
 '차액': 17,
 '좋긴한데': 11,
 '우리나라': 19,
 '이전': 198,
 '케이블': 117,
 '만료': 16,
 '루머': 45,
 '이정': 303,
 '그나': 192,
 '저작권자': 10,
 '안봐': 11,
 '42mm': 18,
 '보조': 27,
 '후기': 376,
 '롯데': 17,
 'ppl': 21,
 '직장인': 10,
 '의도': 9,
 '보자': 32,
 '시키': 48,
 '사용성': 9,
 '비디오': 22,
 '아쉽': 335,
 '구간': 9,
 '적절': 14,
 '도료': 16,
 '사용하다': 46,
 '못하': 468,
 '여러모': 27,
 '줄이': 35,
 '한번더': 13,
 '상승': 34,
 '이후': 230,
 '글쎄': 13,
 '무이자': 27,
 '마이크로': 34,
 '싼거': 19,
 '크롬': 133,
 '행보': 9,
 '깔리': 9,
 '한손': 68,
 '버벅거': 20,
 '후로': 11,
 '금방': 41,
 '결과물': 29,
 'encode': 10,
 '버스': 35,
 '진입': 31,
 'f700s': 25,
 '칭찬': 38,
 '신기': 120,
 '타입': 46,
 '사운드': 111,
 '첨부': 50,
 '떠오르': 17,
 '부럽': 22,
 '난감': 17,
 '수치': 18,
 '대응': 18,
 '처분': 23,
 'os': 43,
 '발전': 57,
 'hdmi': 24,
 '반영': 18,
 '유사': 11,
 '못봐': 9,
 '고장나': 17,
 '카드사': 19,
 '카페': 45,
 '하라': 24,
 '5x': 22,
 '오랜만': 88,
 '이런거': 57,
 '갈수': 15,
 '레노버': 55,
 '밧데리': 55,
 '특유': 19,
 '난다': 14,
 '운전': 16,
 '일반': 271,
 '갤칠': 34,
 'g7x': 15,
 '말로': 24,
 '사라지': 34,
 '태블릿': 21,
 '여행': 55,
 '뜨겁': 39,
 '편리': 57,
 '죽이': 12,
 '아이폰7': 36,
 '있다': 357,
 '이틀': 49,
 '대상': 46,
 '누나': 18,
 '별차이': 22,
 'compulsory': 81,
 '더이': 27,
 '버리': 109,
 '비슷': 295,
 '끄적': 10,
 '위주': 51,
 '알람': 21,
 '소음': 16,
 '내장': 147,
 '여친': 19,
 '넣어주': 12,
 '선택': 585,
 '사고': 144,
 '참여': 37,
 '지포': 24,
 '소개': 70,
 '상담': 42,
 '필요': 317,
 '다음주': 47,
 '엄마': 25,
 '아이폰se': 49,
 '침수': 25,
 '불구': 43,
 '떨구': 29,
 '음향': 34,
 '사용기간': 9,
 'ㅋㅌㅂㅇ': 142,
 '애초': 38,
 '5v': 13,
 '편하': 210,
 '오른쪽': 110,
 '화소': 64,
 '이득': 36,
 '짜증나': 31,
 'newsid': 12,
 '보급형': 65,
 '라오': 10,
 '착각': 11,
 '최적화': 90,
 '공기계': 98,
 '대구': 26,
 '브랜드': 93,
 '내용': 129,
 '디지털': 21,
 '들르': 21,
 '캡쳐': 35,
 '홈피': 9,
 '같습': 63,
 '티탄색상': 18,
 '자세': 115,
 '요거': 9,
 '르그번': 15,
 '뷰2': 19,
 '출력': 70,
 '53mm': 24,
 '기쁘': 10,
 '이면': 22,
 '예의': 25,
 '다녀오': 10,
 '고질적': 10,
 '네비': 14,
 '마시멜로': 77,
 '메인보드': 114,
 '꽂히': 26,
 '차량용': 13,
 '촬영': 213,
 '뽑기': 85,
 '물고': 10,
 '패드': 28,
 '감싸': 11,
 '실행': 78,
 '그부분': 14,
 '센스': 11,
 '체험존': 40,
 '잘몰': 11,
 '욕먹': 18,
 '기억': 83,
 '호환': 112,
 '놔두': 11,
 '전원': 229,
 '삼성꺼': 24,
 '설정': 203,
 '벌어지': 18,
 '아이템': 14,
 '스크래치': 22,
 'h61m': 15,
 '알루미늄': 34,
 '올립': 69,
 '연락처': 9,
 '글씨': 11,
 '빠릿빠릿': 21,
 '마감도': 9,
 '비해': 184,
 '인하': 23,
 '사양': 71,
 '장사': 18,
 '검정': 9,
 '벚꽃': 20,
 '구리': 17,
 '탈부착': 29,
 '어제': 420,
 '쓰긴': 11,
 '신분증': 24,
 '파일': 84,
 '베이스': 21,
 '마치': 65,
 '사신': 33,
 'aod': 24,
 'g5광각': 10,
 '신작': 9,
 '후면': 283,
 '대해': 106,
 '영상': 186,
 '개발': 63,
 '노트4s': 26,
 '교체식': 10,
 '생각': 1984,
 '반응': 97,
 '개통': 684,
 '초성': 22,
 '비하': 31,
 '고장': 90,
 '누르': 237,
 '지4': 15,
 '임대': 17,
 '접어': 11,
 '대만족': 35,
 '미리': 70,
 '단차': 447,
 '코어': 12,
 '보단': 21,
 '전산': 14,
 '단말기': 89,
 'g5사': 51,
 '밸런스': 14,
 '추가지원': 18,
 '파악': 14,
 '지역': 36,
 '안나': 133,
 '베스트샵': 77,
 '확장성': 13,
 '놀라': 33,
 '차량': 23,
 '기타': 61,
 'pro2': 19,
 '주로': 87,
 '와이파이': 134,
 '이동': 252,
 '신호': 13,
 '개인': 76,
 '더럽': 12,
 '벗겨지': 12,
 '강하': 34,
 '주실': 9,
 'the': 414,
 '전화기': 21,
 '4s': 13,
 '아이디': 18,
 'wb': 94,
 '오늘자': 13,
 'mwc': 22,
 '다이얼': 15,
 'youtube': 76,
 '의심': 36,
 '방통': 17,
 '추억': 10,
 '중복': 21,
 '최저': 25,
 '제발': 74,
 '도착': 97,
 '단지': 27,
 '플립': 11,
 '였습니': 20,
 '달고': 42,
 '세트': 18,
 '착탈식': 40,
 '휴대용': 12,
 '두가': 41,
 '나타': 19,
 '벤치마크': 19,
 '번갈': 16,
 '아노다이징': 17,
 '이기': 19,
 '편안': 24,
 '디바이스': 43,
 '끼울': 14,
 '계열': 10,
 '완벽': 71,
 '획기적': 11,
 's7엣지': 115,
 '후에': 67,
 '왜곡': 138,
 '중고': 261,
 '묶이': 14,
 '본문': 9,
 '감탄': 15,
 '띄우': 29,
 '시각': 13,
 '책상': 9,
 '만듦새': 12,
 '좌표': 184,
 '떨어트': 25,
 '신선': 25,
 '한창': 10,
 '밀어': 12,
 '모아': 19,
 '신품': 17,
 '하진': 9,
 '64g': 35,
 '지문': 253,
 '날짜': 31,
 '미루': 15,
 '측면': 72,
 '함정': 31,
 '형광등': 10,
 '단독': 11,
 '해석': 15,
 '차별화': 13,
 '입히': 22,
 '청구': 100,
 '그날': 11,
 '타임': 34,
 '쓸모': 16,
 '보도': 11,
 '기술적': 17,
 '가루': 9,
 '가정': 30,
 '힘들': 229,
 '필요도': 10,
 '군요': 12,
 '인상적': 16,
 '뉴스': 32,
 '정도': 930,
 '집사람': 15,
 '베터리': 130,
 '당연': 136,
 '표현': 54,
 '절연': 21,
 'on': 96,
 '넥서스': 51,
 '각설': 12,
 '통해': 46,
 '마무리': 18,
 '못쓰': 62,
 '자신': 24,
 '흔적': 11,
 '물론': 32,
 '계속': 59,
 '바래': 22,
 '일도': 11,
 'g5쓰': 9,
 '항목': 10,
 '그게': 98,
 '편입': 14,
 '음량': 22,
 '80l000alus': 14,
 '돌고': 11,
 '여태': 43,
 '터치': 172,
 '후반': 20,
 '종종': 19,
 'g580': 35,
 '단어': 20,
 '훌륭': 41,
 '갤럭키': 142,
 '당시': 57,
 '신용카드': 15,
 '여름': 31,
 '인생': 10,
 '주변부': 15,
 '16gb': 11,
 'good': 30,
 '대략': 87,
 '한명': 15,
 '다양': 112,
 '구라베젤': 31,
 '효율': 52,
 '5s': 42,
 '저조': 81,
 'ㄷㄷㄷㄷㄷ': 14,
 '비닐': 16,
 '59요금제': 80,
 '멈추': 42,
 '검수': 13,
 '들림': 22,
 '늘리': 14,
 '유격때문': 10,
 '메세지': 17,
 '몰레': 192,
 '판매자': 62,
 '기본기': 17,
 '뒷부분': 11,
 '두시': 11,
 '기울': 23,
 '저번주': 27,
 '심해': 48,
 '내방': 63,
 '밴드': 246,
 '타격': 18,
 '제일': 206,
 '이벤트': 299,
 '히트': 22,
 '아마존': 21,
 '어떤': 115,
 '여행가': 16,
 '쾌적': 31,
 '기계': 140,
 '지금까지': 105,
 '고속': 90,
 '자꾸': 140,
 '언제쯤': 28,
 '그동안': 68,
 '전후': 15,
 '상품': 67,
 '여유': 22,
 '착탈': 23,
 '판단': 54,
 '일어나': 16,
 'ㅅㅋㄱㅂ': 138,
 '시세가': 13,
 '반대편': 16,
 '올려놓': 13,
 '사람인': 9,
 '하니': 183,
 '무시': 27,
 '월드': 13,
 '대충': 111,
 '재생': 101,
 '시리즈': 117,
 '미지원': 17,
 '업자': 31,
 '못가': 15,
 'by': 37,
 '대용량': 18,
 '안계': 9,
 '전에': 45,
 '망해': 10,
 '둘째': 14,
 '화요일': 16,
 '말도': 23,
 '계신': 147,
 '버전': 79,
 '깜빡': 16,
 's6엣지': 14,
 '정지': 14,
 '요금': 733,
 '사용환경': 11,
 '공감': 12,
 '아쉬운점': 19,
 '바깥': 9,
 '이동하': 18,
 '무언': 19,
 '안녕': 236,
 '분할': 12,
 '좋긴': 23,
 '신박': 14,
 '이만': 11,
 '알지': 18,
 '짜증': 39,
 '한해': 12,
 '동그라미': 12,
 'lg꺼': 10,
 '일요일': 19,
 '분이': 77,
 '용도': 51,
 '순정': 29,
 '사용': 2178,
 '분야': 10,
 '자전거': 20,
 '나오는거': 20,
 '유격': 1077,
 '오프라인': 38,
 '중심': 22,
 'i3': 41,
 '구매가': 23,
 ...}
  • TF-IDF vectorizer

In [9]:
# check vocabulary in TDM
print(len(tfidf_corpus.vocabulary_))
print(len(tfidf_corpus.get_feature_names()))
tfidf_vectorizer.get_feature_names()[1000:1010]


3578
3578
Out[9]:
['대폭', '대하', '대학생', '대한민국', '대해', '대형', '대화면', '댓글', '더럽', '더불']

In [10]:
# frequency count
tfidf_word_sum = tfidf_bow.toarray().sum(axis=0)
tfidf_word_name = tfidf_corpus.get_feature_names()
tfidf_word_dict = {}

for i in range(len(tfidf_word_sum)):
    tfidf_word_dict[tfidf_word_name[i]] = tfidf_word_sum[i]
    
print(tfidf_corpus.get_feature_names()[150:160])
print(tfidf_word_sum[150:160])


['g5쪽', 'g5출시', 'g5카메', 'g5하', 'g5후', 'g6', 'g7', 'g7x', 'galaxy', 'gk']
[  1.9353321    4.90042073   3.53566612   2.89852336   2.48341187
  11.61954354   2.57126653   2.2969142    7.82942507   2.74237528]

In [11]:
# word & index number
tfidf_word_dict


Out[11]:
{'기록': 3.059014735227978,
 '계약서': 4.0762238074865245,
 '마시': 2.9296344494642166,
 '그래픽카드': 7.4830303723052323,
 '고민': 102.47237899899717,
 '괜찮다': 4.5461563979857695,
 '나을': 15.346104063990092,
 '옮기': 17.092562774668181,
 '안타깝': 6.0284246014505154,
 '선택지': 4.8311254278910321,
 '광주': 5.056074283813941,
 '철회': 4.129889748313043,
 '문의': 24.092461171219675,
 '노트북': 17.100678766342558,
 '화이트': 10.765715374793414,
 '유플러스': 19.486255689020755,
 '일하': 4.0667203292007956,
 '기본': 46.679144353133104,
 '구매자': 9.6881068522525364,
 '하이마트': 26.667847797146177,
 '홍보': 7.1818885912708899,
 'ㅌㅋㄴ': 6.6258725718419837,
 '64gb': 4.2459870638483741,
 'ls2d': 6.6553979743928222,
 '경기': 3.1890090523074286,
 '위치': 11.445960340792681,
 '고정이': 2.1158070432499416,
 '먼지': 8.8844900155381872,
 '메뉴': 6.1647463294687954,
 '국내': 25.506655312012178,
 '편한': 5.2286975301955447,
 '평소': 10.478068323474037,
 '싼맛': 1.7639435792524509,
 '손가락': 6.2607914808829337,
 '날씨': 12.575086311364982,
 '입체감': 1.3689955551571156,
 '화웨이': 6.5666922590602104,
 '붙이': 14.427491654970744,
 '효도': 2.3592703402753004,
 '시계': 5.432995293830591,
 '방진': 2.3526061393490409,
 '분리형': 3.3640729137467362,
 '만나': 7.3044262211203321,
 'a7': 10.97119226959974,
 '좋을거': 4.138639515872736,
 '클리앙': 1.6604967304434213,
 '인터파크': 4.0736909979458149,
 '찍기': 2.7722655778636534,
 '답답': 12.088134840725621,
 '매달': 7.292020273741513,
 '무겁': 5.5223640826357849,
 '퀵커버': 2.3533250022721428,
 '전국': 2.0452418369500038,
 '의향': 1.3658346647091231,
 'mode': 15.129171557751288,
 '정리': 10.410856718465597,
 '국민': 3.3550456986007418,
 '일주일': 15.628509247374488,
 '특별': 5.3109853183621389,
 '묻히': 1.9440075174822016,
 '웹서핑': 12.613364915892967,
 '튼튼': 4.8099312120365258,
 '점이': 6.2351153379022781,
 'dslr': 5.0768768875500694,
 '가죽': 4.3527763707368283,
 '대부분': 15.856878258692657,
 '높다': 2.2402374797526634,
 '작업': 5.296182583293044,
 '예약': 26.358424331707003,
 '진심': 2.7084638903556089,
 '사과': 5.738055352628292,
 '칩셋': 1.3101767921621585,
 'cat6': 4.0643718427905364,
 '택배': 24.434754515441231,
 '어둡': 16.086038977647465,
 '노트5': 35.697724202905604,
 '이번': 84.202277322137846,
 '웨이즈': 8.8986381525346943,
 '저번': 6.4505360460927825,
 '특성': 1.9680512346178081,
 '지난번': 2.3914644125504112,
 '파손': 5.2312381450277261,
 '모듈빼': 1.9855136500696458,
 '채우': 6.1620886317620984,
 'kt': 63.043686455350382,
 '단말': 3.3530491478246973,
 '사업부': 2.6493502423743562,
 '안되는거': 4.6859282516575753,
 '게다': 7.6929127061870188,
 '구해': 6.4224290226414942,
 '조립': 3.8997316730024276,
 '장기적': 1.8785343530519556,
 '거리': 9.6646162683667534,
 '탐나': 5.8791890061248129,
 'ㅂㅇ': 20.241717745595089,
 '내야': 4.6512649659288963,
 '달라': 14.269844698052903,
 '가입': 19.52528898643051,
 '판매': 22.446515501285798,
 '해상도': 12.085334465001171,
 '되지': 11.705017838827917,
 '통하': 8.8342980102734465,
 '가기': 7.661232171074456,
 '이럴': 2.0559414311596047,
 '판정': 4.4303538168013059,
 '반응속도': 2.195279803076656,
 '저가형': 3.4106132731493992,
 '정상적': 4.845070271566577,
 '유투브': 6.7247899589042106,
 '적응': 12.247960201409175,
 '절약': 1.6525809510120983,
 'ㄷㄷ': 24.042846161248338,
 '학생': 2.3715838729400605,
 '인봉': 7.6781111005360065,
 '광탈': 8.2732115791158414,
 '활성화': 4.2297387379128164,
 '은거': 2.6343312270064452,
 '스마트워치': 2.0396082022498185,
 '이부': 4.14363858723768,
 '안주': 2.9500409018972347,
 '스냅': 11.727092387170662,
 '길이': 6.3785474650822733,
 '조사': 2.4707506565298409,
 '마구': 1.9315958276384104,
 '인해': 7.7374005271496973,
 'techholic': 0.45500840529409387,
 '다시': 62.368378974545621,
 '싸구려': 2.1132147962552197,
 'ㅅㄷㄹ가': 3.6182793518877432,
 '감기': 2.1363790044877597,
 '대기': 9.7336177943400788,
 '하도': 6.7622284520633604,
 '하필': 2.1161258146394619,
 '가격대': 11.969139229260124,
 '단통': 2.056116918173204,
 '빠르': 24.610393207834363,
 '없애': 9.25012771734289,
 '신경': 16.727798336418417,
 '귀찮': 19.445123786029132,
 '절대': 10.912932702662554,
 '자극': 1.4951740727778311,
 '이르': 9.0376098530492275,
 '편차': 1.5179976015106686,
 '사실상': 7.6569268459121149,
 '작동': 10.53866685383548,
 '바뀌': 16.057759889776182,
 '교품증': 26.925552127779131,
 '한마디': 2.1996089533510967,
 '후속': 4.9901135545594091,
 '취소': 10.815306250411934,
 '아그': 1.7444759026824372,
 '제이슨': 3.6959694423904841,
 '다운': 10.826041057338221,
 '생김': 2.0540486306694596,
 '수채화': 5.33281812109509,
 '중간중간': 2.3564377117181001,
 '인상': 1.996154407532795,
 '받으': 1.848583484342212,
 '갤수육': 3.6130342710019492,
 '내년': 4.6997852944708383,
 '운영': 2.50863043767542,
 'audio': 1.6382541093688254,
 '정착': 1.7589886971405753,
 '모듈': 217.49001232419968,
 '처리': 7.3844043450368018,
 '개취': 5.4656653048316022,
 '확대': 4.8728240605925812,
 '점유율': 4.1920492085250247,
 '어머니': 17.307428494960032,
 '현존': 2.3361778907093957,
 '물리': 8.4185299957335005,
 '서류': 3.7982830896194431,
 '방출': 2.5095244212561485,
 '못생기': 3.106950299138179,
 '세로': 3.625127008137246,
 '현명': 2.4844822793658974,
 '겨울': 1.8077310615527276,
 '각종': 4.1194577356536533,
 '땡기': 9.5034462340998136,
 '공간': 3.4171809544529723,
 '총평': 1.3082871647636123,
 '인간': 1.2948791398607591,
 '해주시': 5.5675150605181285,
 '생길': 5.0117510854312268,
 '사무실': 6.0509832041422245,
 '규모': 1.5697234870288841,
 '오더': 1.6356714575438365,
 '설명': 12.324677320333798,
 '드릴': 4.6105538290674266,
 'sid2': 6.9609510213198131,
 '알아보다': 4.3092581109178507,
 '마감': 17.576829315081536,
 '발급': 5.212865068469835,
 '보다': 8.5249505565370356,
 '사은품': 30.495346821366397,
 '장소': 2.7729265830823038,
 '다음날': 3.9539775121939917,
 '빠르다': 1.7565283280040924,
 '심지': 6.0507970290133484,
 '실수': 7.50941959608181,
 '그럼': 2.7814992884361782,
 '발표': 14.819895712705144,
 '번이': 53.614699049721338,
 '옵션': 5.9805756925181868,
 '울산': 3.5604469259914491,
 '남지': 1.8619178825380287,
 '하단부': 8.2619275008870829,
 '광각': 72.577668205874346,
 '하나하나': 1.9707121379664148,
 '아저씨': 2.3522318134518421,
 '하이': 2.0154246600262904,
 '각도': 2.8059071761997378,
 '보스': 1.9707738268894814,
 '매년': 1.752124328372167,
 '갤6': 18.660673396416637,
 '스타': 3.2591429388288558,
 '어떠': 11.909416883044065,
 '안정감': 1.7636348279340597,
 '망작': 1.7380766646197952,
 '빠릿': 3.7410510517091731,
 '펌웨어': 3.9187736091902989,
 '일반인': 5.5565703028519877,
 '꺼지': 10.413155682730824,
 '불가능': 8.5205571697127702,
 '쓰지': 5.560748439255808,
 '아침': 11.860072366958502,
 '주지': 3.4125934096580823,
 '탑재': 10.558083417923113,
 '물품': 2.1487920816398716,
 '오늘': 118.85004904096492,
 '전화로': 1.7971742259529688,
 '부2': 8.9135793583892529,
 '하지않': 1.7633707094409186,
 '먼저': 12.364619790387753,
 '프렌즈': 14.885212210886445,
 '내면': 5.4380537661287862,
 '근래': 2.7437522824671006,
 '주면': 3.099168542554172,
 '넘사벽': 3.3915724337714037,
 'ㄹㄱㅂㅇ': 25.485513942197315,
 '인정': 5.0524167875603272,
 '과장': 1.8739194839139006,
 '마마무': 4.5925304757061474,
 '산지': 6.1697936409971508,
 '동생': 9.8421488557346422,
 '감감무소식': 2.1117391447782818,
 '양품': 34.073231170280245,
 '변경': 20.551512963409532,
 '하시': 25.12844504329135,
 '하이브리드': 2.3450168849674782,
 'a4용지': 2.9599309877623101,
 '국민카드': 4.5872520283019291,
 'edge': 5.895874296503389,
 '케이스': 80.554470532013411,
 '백그라운드': 2.6464112078183937,
 '이것이': 1.7767137910318078,
 '사놓': 2.6939722424157808,
 '등에': 2.1094351622311578,
 '현실': 7.7603935384872536,
 '프로세서': 2.1681882341584138,
 'g540': 3.3990624157568785,
 '각각': 4.202208379112836,
 'qc': 11.311716411781475,
 '중요시': 2.1448651639581744,
 'galaxy': 7.8294250688893268,
 '그정': 4.9694272795802608,
 '6in': 2.7872005380767932,
 'htc': 5.6484110714511937,
 '고정': 7.3624367271707136,
 '유리': 8.3530998389982241,
 '채택': 4.4885372425835266,
 '목적': 3.9158570533990473,
 '롤링': 4.6219246880283942,
 '삼성': 87.32090518082903,
 'ㅅㅋ': 15.444568758899699,
 '스크린': 5.6999344117973747,
 '욕심': 4.3939917308501348,
 '그림': 2.7712591185266544,
 '덕분': 7.2702313050249963,
 '중요': 13.174137219412573,
 '초중반': 2.5505384376299043,
 '밤에': 5.4611365790196107,
 '지속적': 1.3688469021360727,
 '아내': 2.3008189162606332,
 '9ghz': 2.2476808154259982,
 '신경안쓰': 2.7768593106373696,
 '메모리': 11.864199452648382,
 '모델명': 2.6508505445778674,
 '노이즈': 8.243884175621961,
 '보다보': 6.0515852651238049,
 '인수': 2.4527583818264671,
 '고르': 10.596824726485844,
 '나오길': 3.8786914974221154,
 '남자': 8.9464811500125379,
 '물에': 2.7295098264224031,
 '아식스': 3.4038388563469764,
 '글쓰': 4.271713249084601,
 '고집': 3.1867634711598836,
 '반해': 1.8570584141507782,
 '이젠': 8.770766557681382,
 '방식': 14.473926214967964,
 '사정': 1.670850710749568,
 '은근': 7.1576480504665376,
 '단통법': 19.196687454956887,
 '보여주': 8.0223167650298759,
 '외관': 14.24540743236542,
 '오고': 4.4060994739689985,
 '저장': 6.6155287420546962,
 '위하': 15.628175184183826,
 '연락': 16.437649714644714,
 '지우': 3.6611491368007028,
 '때까지': 1.3910457246632868,
 '아가': 2.6369616350641296,
 '대화면': 1.9590814778927006,
 '신경쓰': 11.713518386072931,
 '커지': 2.9758692081354381,
 '서비스': 15.278969899220487,
 '용어': 3.9121775304621198,
 '얼른': 6.5995656070675963,
 '감사': 18.668453726497539,
 '차후': 3.2839846755920523,
 '기업': 6.4515312859338092,
 '2년정': 2.07985234659836,
 '오해': 2.5589205563415889,
 '상상': 4.5098141877009388,
 '부무': 11.213921205648717,
 '완납': 9.9755919717255779,
 '서랍': 1.9599587452586675,
 '업무': 3.6750114032942642,
 '해외': 15.066721022123183,
 '올라가': 4.303729735738302,
 '현완': 14.915754438622583,
 '귀가': 3.0144156117923306,
 '치명적': 5.3613657114527324,
 '이거': 59.871076015117779,
 '128gb': 1.5526004118952574,
 '하반기': 3.2933166786964234,
 '상태': 27.482183618173764,
 '안정성': 2.0716494268013346,
 '커뮤니티': 3.0982489925985339,
 '엄청': 45.087365676410563,
 '뒤지': 2.6295998355465029,
 '현실적': 1.403352738026999,
 '정품': 9.125151319103292,
 '갤s7': 17.1263971225879,
 'url': 2.2908471949412963,
 '선보': 2.7014289678498464,
 'sd': 13.22121583862824,
 '보류': 2.0134242113042511,
 '일일이': 1.6788668679477432,
 '114': 2.2489271596350617,
 '인터페이스': 2.1877757696010187,
 '마케팅': 17.712050053656728,
 '이어폰': 58.730097010610727,
 '분위기': 9.7986351266857472,
 '정확': 12.979621667828395,
 '맨날': 3.8774642613101329,
 '저항': 2.6959903670863148,
 '유지': 26.455371730305139,
 '원활': 2.4067119294039365,
 '팍팍': 2.7604613095114781,
 '다가오': 3.4528821025130014,
 '서비스센터': 24.548211972935345,
 'be': 5.388858368427532,
 '떨어지': 18.210391784968483,
 '착한': 2.5023498460849014,
 '하루': 19.318029614472596,
 '해결방법': 2.5139469931465368,
 '트렌드': 2.0768476357331065,
 '기왕': 1.871493742947866,
 '극복': 1.8116715801756638,
 '네트워크': 1.6482458682085441,
 'vga': 2.3001823733602289,
 '리모콘': 2.2015792664162785,
 '현대': 2.930225646176885,
 '단점': 22.866477873724982,
 '폰값': 2.3309296998486824,
 '오류': 9.3165497856506576,
 '난리': 5.7838899490198799,
 '등록': 7.2258667036878279,
 '계기': 1.8091654977152987,
 '개인적': 34.832867527669215,
 'naver': 25.232449107005369,
 'for': 3.623037907262761,
 '5se': 2.9961487719469515,
 'laptop': 4.2299158988759213,
 '중앙': 2.0394096412798581,
 '자체': 22.683960882631549,
 '연결': 37.188992478583366,
 '여론': 1.7813613487268656,
 '스토어': 1.4633626602071579,
 '중국': 12.218633253798316,
 '앰프': 2.0188167603068092,
 '유선': 4.6375889950475591,
 '눈팅': 12.497774781165853,
 'electronics': 5.9088553904621888,
 '완전체': 1.6781593831226143,
 '어짜피': 5.8054650881899459,
 '구경': 10.407416191772464,
 '방탄': 2.2385613460144604,
 '베샵': 6.2372694709346028,
 '인거': 4.8143184340887641,
 '귀엽': 4.1987043976935148,
 '카드결제': 4.0271065327153623,
 '하나': 39.334851043532112,
 '적지': 1.8752199948368542,
 '기사': 8.5237784826353664,
 '포함': 14.08536911861944,
 '건너': 2.1201580748354263,
 '정보공유': 2.5365324309919992,
 '구글': 10.362583339004102,
 '어려': 3.4565687210701102,
 '해당': 9.5433208476167852,
 'asrock': 1.9279887694599425,
 '잠금': 5.3635489094234057,
 '나타내': 4.3966521890352093,
 '데이트': 19.650724787419701,
 '열흘': 3.0146379380613637,
 '지프': 7.7693558024360421,
 '대세': 5.4955872257658038,
 '나르': 28.123380703084585,
 '허허': 4.9848275527299961,
 '노트3': 12.918958635532704,
 '빠릿하': 2.3669622177006082,
 '겔럭시': 2.5740719507629239,
 '플레이': 7.1249312500380331,
 'sw': 1.4263747455165994,
 '발생': 16.351719233426639,
 '지네': 2.1387429392608457,
 '미디어': 3.6184529297066366,
 '지점': 2.812094678380086,
 '마트': 2.8340847762792163,
 '꺼리': 1.5633813206638139,
 '상단': 14.160289100818517,
 '돌아가': 7.4089783487610097,
 '좋다': 15.302225192610656,
 '게임': 43.665520561860255,
 '생기': 26.281988617901717,
 '분리': 17.178694236962755,
 '카드': 26.344464773141439,
 '빠지': 13.587959383043684,
 '쌓이': 1.8310721133528163,
 '예상': 22.148152877352249,
 '성향': 2.1356363299605516,
 '땡겨': 1.3382667635281975,
 '개철': 10.642740857469159,
 '제시': 1.4558956549444431,
 '샀다': 3.9993765953029161,
 '능력': 1.3994032528835449,
 'aspx': 3.1134217500744952,
 '한데': 19.553885720457274,
 '희망': 3.0797441237696503,
 '그때': 7.1325732368754418,
 '강력': 2.3017433986866793,
 '듣기': 2.6635890485202234,
 '반대쪽': 2.712918902567782,
 '강제': 3.9428462342604051,
 '아마': 4.5374480000541872,
 '허접': 2.629831499695634,
 'lcd': 7.5663073672847112,
 '있을까': 29.037860947922713,
 '일정': 6.2787122400797379,
 '유일': 3.0675856858779302,
 '간지': 1.9234558679925935,
 'gpro2': 3.8274038809864379,
 '롤리팝': 4.1330677239791118,
 '주변기기': 5.6738856457221001,
 '볼땐': 2.5727646352419349,
 '수요': 2.5653428777606826,
 '로지텍': 3.4819740974786253,
 '구합': 2.3786801434963802,
 '가족': 12.069595698744816,
 '와중': 3.3403735649386852,
 '어떨까': 4.7403787412924183,
 '원가': 3.094316556850043,
 '흥미': 3.5128640416788204,
 '디자인과': 2.9234519565510326,
 '순위': 3.9686780063954514,
 '결론적': 2.9020086771837508,
 '이상은': 3.6044494547806152,
 '주변': 10.734787437469324,
 'of': 2.4801377692488673,
 '검색': 25.839799149721664,
 '599요금제': 7.7909246208232394,
 '한시': 3.5099318896157499,
 '간단': 14.113725299492579,
 '쥐5': 5.2753195776744404,
 '확정': 4.3733972514905171,
 '어이': 1.6402439688946335,
 '더하': 5.6482108797892723,
 '직영점': 8.0208852009829119,
 '부정적': 2.1539148583172074,
 '공짜': 11.341236303446486,
 '걸치': 0.71404938435427023,
 '고치': 4.2463400956706625,
 '기회': 6.2299791542631882,
 '측정': 9.0655189254327215,
 '버벅': 8.4291125722828397,
 '장착': 21.028297489550614,
 'samsung': 3.0204791180269996,
 '스냅드래곤': 4.7684565525076588,
 '아니': 110.0693138672469,
 '버젼': 2.3819773143310821,
 '성공': 14.342890189520325,
 '설레': 3.4406238611417739,
 '한쪽': 8.9837852980143786,
 '남기': 9.9462581597617206,
 '잡음': 3.5974391875849756,
 '한번': 18.625456844276162,
 '이제': 55.547072694354739,
 '정책': 26.91757231937639,
 'lte': 17.121175449704868,
 '유독': 3.4987717913403817,
 '저하': 2.2473804275801434,
 '버튼': 24.943226680760876,
 '단자': 9.0521420350245041,
 '자국': 2.9378490467358183,
 '현상': 17.745610661649124,
 'lte2': 2.5937071055175895,
 '재미': 9.1792506580455431,
 '패턴': 4.4733592841561718,
 '팬택': 4.6138443350003806,
 '걸리': 19.57344125351268,
 '어서': 4.3129889995059694,
 '젠더': 15.115701106973678,
 '내일': 43.265778730726034,
 'g4': 82.505880415487866,
 '내부': 8.1899405683975104,
 '방금': 16.156972100468831,
 '있는곳': 2.174209600340983,
 '차액': 2.6884473022662436,
 '좋긴한데': 2.2187290301698379,
 '우리나라': 3.502273417518186,
 '이전': 19.832563099352583,
 '케이블': 14.80954135405211,
 '만료': 2.7896919514701248,
 '루머': 8.6366858907752508,
 '이정': 49.719913602511802,
 '그나': 26.647893687295216,
 '저작권자': 0.5440204469005413,
 '안봐': 2.67954067302166,
 '42mm': 2.1982643173520633,
 '보조': 4.801077055450433,
 '후기': 37.49660533176732,
 '롯데': 2.7415529242796777,
 'ppl': 6.6946347905039563,
 '직장인': 1.5492643589111252,
 '의도': 1.6508867267571528,
 '보자': 4.0862130069627201,
 '시키': 6.1583339583463399,
 '사용성': 1.8597163913235426,
 '비디오': 4.9534177744256809,
 '아쉽': 39.361604817265494,
 '구간': 0.88941008423171797,
 '적절': 2.230528817286928,
 '도료': 1.7951564676753096,
 '사용하다': 7.2083003719441772,
 '못하': 45.589848827674693,
 '여러모': 4.869349865721972,
 '줄이': 4.4528238216492113,
 '한번더': 2.2652186497703317,
 '상승': 5.4477988838357208,
 '이후': 26.295464410782092,
 '글쎄': 2.2779346602733721,
 '무이자': 4.5050267679108709,
 '마이크로': 3.6684604037595205,
 '싼거': 3.8928494033067098,
 '크롬': 20.705942806383412,
 '행보': 1.4626032271301583,
 '깔리': 1.8962920758618871,
 '한손': 8.3609549743799096,
 '버벅거': 3.9731931010595232,
 '후로': 2.3944222807663835,
 '금방': 7.099205017642654,
 '결과물': 4.326543802208846,
 'encode': 0.85853129842784903,
 '버스': 6.9962896143024604,
 '진입': 5.0076395558632507,
 'f700s': 3.7054906477065739,
 '칭찬': 5.0448301963369042,
 '신기': 17.771305836021977,
 '타입': 4.9672767385194296,
 '사운드': 10.116966910812145,
 '첨부': 7.4439399351498485,
 '떠오르': 3.5410377044452814,
 '부럽': 4.7741024718732152,
 '난감': 3.0971477820201261,
 '수치': 2.7562493612504455,
 '대응': 3.3612742884955278,
 '처분': 3.5110105297886509,
 'os': 6.0159620176861326,
 '발전': 7.3855983815260347,
 'hdmi': 2.7201245442546353,
 '반영': 2.6413705065326338,
 '유사': 1.4343415139562712,
 '못봐': 2.4371129253209811,
 '고장나': 3.4827905527800915,
 '카드사': 3.3937593205324226,
 '카페': 6.476097018915314,
 '하라': 3.8875779930031977,
 '5x': 3.9510671188252835,
 '오랜만': 17.376585231198291,
 '이런거': 9.3126657494644345,
 '갈수': 2.8738877064304917,
 '레노버': 10.348517748436345,
 '밧데리': 10.474885906453464,
 '특유': 2.81103873746074,
 '난다': 2.2794058178883176,
 '운전': 2.3142184894890661,
 '일반': 33.495859238356886,
 '갤칠': 8.2759788330628812,
 'g7x': 2.296914201843463,
 '말로': 3.7660585051281279,
 '사라지': 5.3976257095624698,
 '태블릿': 2.8804135052162931,
 '여행': 7.8158070493704441,
 '뜨겁': 6.8213535408783246,
 '편리': 5.4362216655470306,
 '죽이': 2.5807489189035056,
 '아이폰7': 6.4459073900500687,
 '있다': 32.3831816326759,
 '이틀': 7.6213795286156589,
 '대상': 4.8416271612497424,
 '누나': 3.5810969745351313,
 '별차이': 4.4078729960846035,
 'compulsory': 5.806457619627893,
 '더이': 5.1192238581422655,
 '버리': 14.854943138377159,
 '비슷': 37.455093506113101,
 '끄적': 1.5565437625253697,
 '위주': 6.9335207876983374,
 '알람': 2.9711872402669517,
 '소음': 1.2240980694965871,
 '내장': 21.253575526805943,
 '여친': 4.7777543380379646,
 '넣어주': 2.8641689611814503,
 '선택': 70.01610425620072,
 '사고': 23.972067960133025,
 '참여': 5.8906312065792203,
 '지포': 6.3637883052777484,
 '소개': 8.3818950860880754,
 '상담': 6.0487811950345236,
 '필요': 33.33285565725118,
 '다음주': 10.02461965287169,
 '엄마': 4.9453011872291848,
 '아이폰se': 9.3339407798118064,
 '침수': 5.4908096120166121,
 '불구': 4.8360574970980501,
 '떨구': 5.7977409368210928,
 '음향': 4.0751623144396385,
 '사용기간': 1.9198925034462617,
 'ㅋㅌㅂㅇ': 34.816449341024551,
 '애초': 6.0117847140293117,
 '5v': 2.2298017023148438,
 '편하': 22.116196233308937,
 '오른쪽': 15.20721192744629,
 '화소': 6.6832660603338852,
 '이득': 7.4927469641355771,
 '짜증나': 6.6755798903121928,
 'newsid': 3.0085053067107861,
 '보급형': 9.4368935836123828,
 '라오': 1.6780567077373709,
 '착각': 1.8212967014478723,
 '최적화': 12.730444770068656,
 '공기계': 18.802755213654699,
 '대구': 7.2096995800211481,
 '브랜드': 10.23056248757821,
 '내용': 14.111635929267539,
 '디지털': 2.1675946046223302,
 '들르': 2.8420770680940399,
 '캡쳐': 6.5390169760065673,
 '홈피': 2.4062679149634176,
 '같습': 12.050704536797271,
 '티탄색상': 3.5201597300669261,
 '자세': 15.603623341590296,
 '요거': 1.9235727930708011,
 '르그번': 4.6641755353002372,
 '뷰2': 3.2665756507872072,
 '출력': 7.1370515841051922,
 '53mm': 2.946401344242497,
 '기쁘': 1.8029735406326235,
 '이면': 4.9868942437794237,
 '예의': 4.6201783740401927,
 '다녀오': 2.6060016749928243,
 '고질적': 1.9108933700742607,
 '네비': 2.2944255856399498,
 '마시멜로': 10.460742526805646,
 '메인보드': 14.976979410634298,
 '꽂히': 4.4572744698943234,
 '차량용': 2.5261346689284125,
 '촬영': 19.824380535103082,
 '뽑기': 14.083872903591987,
 '물고': 1.9145155350498375,
 '패드': 4.0311819692219952,
 '감싸': 1.819519837565601,
 '실행': 9.2697153092449511,
 '그부분': 2.5658228940394832,
 '센스': 2.3106388358429286,
 '체험존': 8.9640270877510133,
 '잘몰': 2.4761405135753054,
 '욕먹': 4.0389758840941958,
 '기억': 10.575995431276887,
 '호환': 19.01106938776034,
 '놔두': 1.294352085756052,
 '전원': 31.216452475774474,
 '삼성꺼': 4.1083416277262366,
 '설정': 25.538463255583434,
 '벌어지': 3.1932340564201271,
 '아이템': 2.9801585603576344,
 '스크래치': 3.7996872996743853,
 'h61m': 2.7320591049642795,
 '알루미늄': 3.9914364733420182,
 '올립': 11.651200132189425,
 '연락처': 1.9853608261399334,
 '글씨': 1.7726709293603915,
 '빠릿빠릿': 3.2158829974424719,
 '마감도': 1.6807410925455628,
 '비해': 16.928773457789539,
 '인하': 5.6387025054765934,
 '사양': 10.382208802346682,
 '장사': 2.9618974133528084,
 '검정': 1.5336499051789547,
 '벚꽃': 4.7703428469136169,
 '구리': 4.3396540515434063,
 '탈부착': 4.5779899519165568,
 '어제': 55.905339397234094,
 '쓰긴': 2.237396945409277,
 '신분증': 4.3222857911357924,
 '파일': 10.847968052428863,
 '베이스': 1.8455876084002574,
 '마치': 7.2903707130194286,
 '사신': 8.1773559430042884,
 'aod': 3.8527647629207999,
 'g5광각': 3.9589046541100235,
 '신작': 2.112721730400525,
 '후면': 31.463162485722574,
 '대해': 11.22905772936274,
 '영상': 25.308691706330624,
 '개발': 5.8498119212583246,
 '노트4s': 5.7826149969823897,
 '교체식': 2.7124467606924796,
 '생각': 143.97184562519266,
 '반응': 14.896237706895947,
 '개통': 80.796171303944419,
 '초성': 4.1045683628646934,
 '비하': 4.2619040695073407,
 '고장': 14.950841657415626,
 '누르': 25.137317545666285,
 '지4': 2.7573363304261655,
 '임대': 3.3205475926634662,
 '접어': 1.8723946749338272,
 '대만족': 5.9466390910017584,
 '미리': 9.553042175187759,
 '단차': 57.030391042504007,
 '코어': 1.2038262444981747,
 '보단': 3.1723625349134617,
 '전산': 2.6153198198020644,
 '단말기': 11.966904188751599,
 'g5사': 14.610571069775299,
 '밸런스': 2.4465776608704397,
 '추가지원': 3.318374536366091,
 '파악': 2.4531692023185077,
 '지역': 6.0928660453851569,
 '안나': 19.920294708395609,
 '베스트샵': 15.160853771825904,
 '확장성': 3.0553768241193411,
 '놀라': 4.7958243583818296,
 '차량': 3.6889608103317797,
 '기타': 7.4966063996661907,
 'pro2': 3.2448118112157922,
 '주로': 10.289033952684781,
 '와이파이': 20.663545797888272,
 '이동': 47.271134522744987,
 '신호': 1.4324663890769764,
 '개인': 8.5473437802323211,
 '더럽': 2.8083358769980413,
 '벗겨지': 1.8127491736337689,
 '강하': 5.5531295304483876,
 '주실': 2.9023622112044705,
 'the': 10.571641780604871,
 '전화기': 3.2835194321502441,
 '4s': 1.7934637411015855,
 '아이디': 3.5321928816580694,
 'wb': 6.6851255005376764,
 '오늘자': 4.2015968746030463,
 'mwc': 4.8102502117181514,
 '다이얼': 1.8421217580579363,
 'youtube': 18.164886150384476,
 '의심': 5.599721323087242,
 '방통': 3.9035351884443146,
 '추억': 1.8382177244434708,
 '중복': 4.5113885282017767,
 '최저': 4.9188765389032545,
 '제발': 14.108314368883127,
 '도착': 16.986228848605712,
 '단지': 4.6185297051481271,
 '플립': 2.2276453540913987,
 '였습니': 2.278271215819387,
 '달고': 6.056523216438773,
 '세트': 2.4491891033152697,
 '착탈식': 7.0154899185396777,
 '휴대용': 2.2027868803308372,
 '두가': 7.1499111726224793,
 '나타': 2.1943120582863238,
 '벤치마크': 2.765281778057179,
 '번갈': 2.7421141358628147,
 '아노다이징': 2.8616272547222747,
 '이기': 4.4580772659544188,
 '편안': 2.6313606482728775,
 '디바이스': 2.6563455674638736,
 '끼울': 2.290347489714839,
 '계열': 2.023929569681246,
 '완벽': 9.4136465386519568,
 '획기적': 2.6408663063632365,
 's7엣지': 18.256155156762151,
 '후에': 9.5957486483753218,
 '왜곡': 15.487284976996845,
 '중고': 33.829387902539118,
 '묶이': 2.9117845644378888,
 '본문': 1.6622988764010627,
 '감탄': 2.1497125965982171,
 '띄우': 4.2598884510693189,
 '시각': 3.3886360281092944,
 '책상': 1.8016791412060349,
 '만듦새': 1.4475593137419567,
 '좌표': 40.26925466988407,
 '떨어트': 5.3266457192336336,
 '신선': 5.0990407430590814,
 '한창': 1.5051411550329443,
 '밀어': 1.8479146615731397,
 '모아': 2.8738449287176611,
 '신품': 3.1626246021784015,
 '하진': 1.8121881169786951,
 '64g': 7.7399772882727325,
 '지문': 27.588574725175459,
 '날짜': 5.099569490806025,
 '미루': 3.0936680688488272,
 '측면': 7.7173131398906545,
 '함정': 6.9084287479389523,
 '형광등': 1.9451778451965023,
 '단독': 1.9754244267499601,
 '해석': 3.7159758031248855,
 '차별화': 2.2345395031851591,
 '입히': 3.0321614327082309,
 '청구': 14.378880849158325,
 '그날': 2.5697560235591448,
 '타임': 6.2390255967601354,
 '쓸모': 3.2225798912676611,
 '보도': 2.4735608997559,
 '기술적': 2.8520936724804002,
 '가루': 2.2703372301918687,
 '가정': 4.2413232761107071,
 '힘들': 32.595497097644262,
 '필요도': 1.1560113450375979,
 '군요': 1.8797602658645938,
 '인상적': 2.3794338295571054,
 '뉴스': 5.7030128839438197,
 '정도': 77.797971170013028,
 '집사람': 3.003722188920229,
 '베터리': 19.760519319223597,
 '당연': 18.489301168464412,
 '표현': 6.3628649036396441,
 '절연': 2.8285091725827898,
 'on': 7.2784452136919251,
 '넥서스': 7.2584771787623152,
 '각설': 1.8409439266488459,
 '통해': 4.8510071004465685,
 '마무리': 2.7832330001940813,
 '못쓰': 10.034049718092961,
 '자신': 2.3393589830070214,
 '흔적': 2.6875599813501849,
 '물론': 4.4141390239170448,
 '계속': 8.9603071759190076,
 '바래': 4.3036849010769265,
 '일도': 1.5834183454481803,
 'g5쓰': 2.0342803642713414,
 '항목': 1.8582113779790361,
 '그게': 12.419967146999777,
 '편입': 1.5316852096337512,
 '음량': 3.8263343334866899,
 '80l000alus': 3.212475482880687,
 '돌고': 2.2401097105969301,
 '여태': 6.9329850467676728,
 '터치': 20.804384560432506,
 '후반': 4.5600279922471252,
 '종종': 2.978359123983926,
 'g580': 6.7855985572336008,
 '단어': 3.7247529190460509,
 '훌륭': 5.708541570294221,
 '갤럭키': 27.467052536817587,
 '당시': 6.8564076523901623,
 '신용카드': 2.7675040092607959,
 '여름': 4.4096716993363581,
 '인생': 1.4306957404948222,
 '주변부': 1.3828590241888417,
 '16gb': 1.519817887750057,
 'good': 2.1605577362378874,
 '대략': 13.702601283776149,
 '한명': 3.0037045215786056,
 '다양': 10.28237745745116,
 '구라베젤': 8.3418454479577804,
 '효율': 5.7694142341193251,
 '5s': 6.0425999936922521,
 '저조': 11.695479344280155,
 'ㄷㄷㄷㄷㄷ': 4.9813700902829767,
 '비닐': 3.4210503999103454,
 '59요금제': 15.385753633967701,
 '멈추': 7.1066711268809808,
 '검수': 1.9368491157122432,
 '들림': 3.2957730646135679,
 '늘리': 2.9206866036031642,
 '유격때문': 2.4531361781691832,
 '메세지': 2.6819008951163035,
 '몰레': 20.77399908223574,
 '판매자': 9.8103339783950751,
 '기본기': 2.8860524491200037,
 '뒷부분': 2.0596118225345492,
 '두시': 1.9934696534559309,
 '기울': 3.9919589458652314,
 '저번주': 6.5130994151058186,
 '심해': 8.7736637698652835,
 '내방': 12.914567682347272,
 '밴드': 36.299120120101406,
 '타격': 2.8486071418621477,
 '제일': 24.992559751249217,
 '이벤트': 45.906233267885334,
 '히트': 3.6212376140879794,
 '아마존': 4.486738032889737,
 '어떤': 25.536814467667249,
 '여행가': 3.2383038991735664,
 '쾌적': 3.8776126096343391,
 '기계': 19.683344980851292,
 '지금까지': 14.358557633833817,
 '고속': 13.25108467082922,
 '자꾸': 19.443937574746091,
 '언제쯤': 7.1025220967935399,
 '그동안': 8.9940574422983026,
 '전후': 2.8355927722100729,
 '상품': 8.3106422558199995,
 '여유': 3.6307436957597603,
 '착탈': 3.6957758605471418,
 '판단': 6.8291655635593189,
 '일어나': 1.9564593816732607,
 'ㅅㅋㄱㅂ': 33.573983928503658,
 '시세가': 3.051510739674637,
 '반대편': 2.1584673339479905,
 '올려놓': 3.0458909949833739,
 '사람인': 1.6413800583464662,
 '하니': 22.495116143888414,
 '무시': 3.5595970040448033,
 '월드': 3.2700096254844575,
 '대충': 18.842066804856152,
 '재생': 12.173103086811359,
 '시리즈': 14.641694194845831,
 '미지원': 2.2971138148134029,
 '업자': 5.1872079555723012,
 '못가': 2.8961241016957291,
 'by': 4.1634541916519652,
 '대용량': 4.5924982781169552,
 '안계': 2.4799662593268663,
 '전에': 8.2023907493022516,
 '망해': 2.3837991981766664,
 '둘째': 2.2412812818206964,
 '화요일': 2.8815616520873277,
 '말도': 4.5417688252386155,
 '계신': 27.969248857080906,
 '버전': 12.395361270015846,
 '깜빡': 3.4800518195869734,
 's6엣지': 2.3170427758418626,
 '정지': 2.250332667152839,
 '요금': 83.657255949289976,
 '사용환경': 1.2433281853993017,
 '공감': 1.552119059162177,
 '아쉬운점': 2.0105284723666901,
 '바깥': 2.1400772876892482,
 '이동하': 4.0964574233222173,
 '무언': 3.0530508191565211,
 '안녕': 32.093151849514044,
 '분할': 1.8086287569656487,
 '좋긴': 3.2418801433324469,
 '신박': 4.776194521370126,
 '이만': 1.6125449590134167,
 '알지': 3.0265907406924271,
 '짜증': 6.447933132230931,
 '한해': 1.8471092219896084,
 '동그라미': 3.4734725611484625,
 'lg꺼': 2.0489889615166295,
 '일요일': 4.0393714482936076,
 '분이': 9.7530012058382791,
 '용도': 6.9124548628479081,
 '순정': 4.8313450906673312,
 '사용': 143.05304496597481,
 '분야': 1.5541991823157857,
 '자전거': 3.5162796626974653,
 '나오는거': 4.5965986068068529,
 '유격': 110.69394623277742,
 '오프라인': 6.6402598515072242,
 '중심': 2.4305379265544653,
 'i3': 7.3999415231453822,
 '구매가': 4.5305828824458212,
 ...}

File write by csv


In [12]:
with open('tf.csv', 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.writer(f, delimiter=',', lineterminator='\n')
    for key, value in tf_word_dict.items():
        w.writerow([key, value])

with open('tfidf.csv', 'w') as f:
    w = csv.writer(f, delimiter=',', lineterminator='\n')
    for key, value in tfidf_word_dict.items():
        w.writerow([key, value])

Co-occurence Matrix


In [13]:
tf_cooccur = (tf_bow.T * tf_bow) # co-occurrence matrix in sparse csr format
tf_cooccur.setdiag(0) # fill same word cooccurence to 0
print(tf_cooccur.todense()) # print out matrix in dense format
tf_cooccur


[[0 3 0 ..., 0 0 0]
 [3 0 0 ..., 2 1 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 2 0 ..., 0 0 1]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 1 0 0]]
Out[13]:
<3578x3578 sparse matrix of type '<class 'numpy.int64'>'
	with 3661940 stored elements in Compressed Sparse Column format>

In [14]:
# create co-occurence matrix to pandas dataframe
tf_df = pd.DataFrame(data=tf_cooccur.todense(),
             index=tf_corpus.vocabulary_.keys(),
             columns=tf_corpus.vocabulary_.keys())
tf_df[0:10]


Out[14]:
ls2d 사과 기회 게다 장기적 사지 판매 파지 광탈 인해 ... 들뜨 올라오 나머지 풍경 feat 예민 모듈식 분리 안받 하이엔드
ls2d 0 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
사과 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 2 1 0
기회 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 1
게다 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
장기적 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
사지 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 0 0
판매 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
파지 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
광탈 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
인해 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10 rows × 3578 columns


In [15]:
# co-occurence matrix to csv
tf_df.to_csv('tf_cooccur.csv', index=True, sep=';')

In [16]:
# use Korean font, set mask image
font_path = 'C:/Windows/Fonts/NanumBarunGothicBold.otf'
mask_image = np.array(Image.open('D:/Document/project/HYStudy/scripts/[HYStudy 17th] mask_image.jpg'))

# Generate a word cloud image and display
wordcloud = WordCloud(max_font_size=72,
                      font_path=font_path,
                      background_color='white',
                      mask=mask_image).generate_from_frequencies(tf_word_dict)

plt.figure(figsize=(30, 90))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()